Exploratory Data Analysis

# Fix the random-number seed so the random row sampling later in the script is reproducible:
set.seed(seed = 1)

# Preview the first ten rows of the data frame:
head(x = data, n = 10L)
# Inspect column names, storage types and leading values:
str(object = data)
## 'data.frame':    2925 obs. of  74 variables:
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  "None" "None" "None" "None" ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wood" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "wood" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "Gas" "Gas" "Gas" "Gas" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" "None" "None" "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fence          : chr  "None" "MnPrv" "None" "None" ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
# Look at missing values in dataframe:
# Count NAs per column explicitly and list only the columns that have any.
# summary() below prints an "NA's" row only for numeric columns; character
# columns show just Length/Class/Mode, so their NAs would go unnoticed.
na_counts <- colSums(is.na(data))
na_counts[na_counts > 0]
summary(data)
##   MS.Zoning          Lot.Frontage       Lot.Area         Street         
##  Length:2925        Min.   : 21.00   Min.   :  1300   Length:2925       
##  Class :character   1st Qu.: 58.00   1st Qu.:  7438   Class :character  
##  Mode  :character   Median : 68.00   Median :  9428   Mode  :character  
##                     Mean   : 69.07   Mean   : 10108                     
##                     3rd Qu.: 80.00   3rd Qu.: 11520                     
##                     Max.   :313.00   Max.   :215245                     
##                     NA's   :490                                         
##     Alley            Lot.Shape         Land.Contour        Lot.Config       
##  Length:2925        Length:2925        Length:2925        Length:2925       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Land.Slope        Condition.1        Condition.2         Bldg.Type        
##  Length:2925        Length:2925        Length:2925        Length:2925       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  House.Style         Overall.Qual     Overall.Cond     Year.Built  
##  Length:2925        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.094   Mean   :5.565   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2001  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##  Year.Remod.Add  Roof.Style         Roof.Matl         Exterior.1st      
##  Min.   :1950   Length:2925        Length:2925        Length:2925       
##  1st Qu.:1965   Class :character   Class :character   Class :character  
##  Median :1993   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1984                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior.2nd       Mas.Vnr.Type        Mas.Vnr.Area     Exter.Qual       
##  Length:2925        Length:2925        Min.   :   0.0   Length:2925       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 101.1                     
##                                        3rd Qu.: 164.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :23                         
##   Exter.Cond         Foundation         Bsmt.Qual          Bsmt.Cond        
##  Length:2925        Length:2925        Length:2925        Length:2925       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Bsmt.Exposure      BsmtFin.Type.1      BsmtFin.SF.1    BsmtFin.Type.2    
##  Length:2925        Length:2925        Min.   :   0.0   Length:2925       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 370.0   Mode  :character  
##                                        Mean   : 439.3                     
##                                        3rd Qu.: 734.0                     
##                                        Max.   :2288.0                     
##                                        NA's   :1                          
##   BsmtFin.SF.2      Bsmt.Unf.SF     Total.Bsmt.SF    Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0   Length:2925       
##  1st Qu.:   0.00   1st Qu.: 219.0   1st Qu.: 793   Class :character  
##  Median :   0.00   Median : 464.5   Median : 990   Mode  :character  
##  Mean   :  49.81   Mean   : 559.2   Mean   :1048                     
##  3rd Qu.:   0.00   3rd Qu.: 801.2   3rd Qu.:1300                     
##  Max.   :1526.00   Max.   :2336.0   Max.   :3206                     
##  NA's   :1         NA's   :1        NA's   :1                        
##   Heating.QC        Central.Air         Electrical         X1st.Flr.SF  
##  Length:2925        Length:2925        Length:2925        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 877  
##  Mode  :character   Mode  :character   Mode  :character   Median :1084  
##                                                           Mean   :1157  
##                                                           3rd Qu.:1383  
##                                                           Max.   :3820  
##                                                                         
##   X2nd.Flr.SF     Low.Qual.Fin.SF     Gr.Liv.Area   Bsmt.Full.Bath  
##  Min.   :   0.0   Min.   :   0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0.0   1st Qu.:   0.000   1st Qu.:1126   1st Qu.:0.0000  
##  Median :   0.0   Median :   0.000   Median :1442   Median :0.0000  
##  Mean   : 335.2   Mean   :   4.685   Mean   :1496   Mean   :0.4307  
##  3rd Qu.: 703.0   3rd Qu.:   0.000   3rd Qu.:1742   3rd Qu.:1.0000  
##  Max.   :2065.0   Max.   :1064.000   Max.   :4476   Max.   :3.0000  
##                                                     NA's   :2       
##  Bsmt.Half.Bath     Full.Bath       Half.Bath      Bedroom.AbvGr  
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.0000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.0609   Mean   :1.566   Mean   :0.3791   Mean   :2.855  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.0000   Max.   :4.000   Max.   :2.0000   Max.   :8.000  
##  NA's   :2                                                        
##  Kitchen.AbvGr   Kitchen.Qual       TotRms.AbvGrd     Functional       
##  Min.   :0.000   Length:2925        Min.   : 2.000   Length:2925       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.044                      Mean   : 6.438                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces     Fireplace.Qu       Garage.Type        Garage.Yr.Blt 
##  Min.   :0.0000   Length:2925        Length:2925        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1960  
##  Median :1.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5979                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :159   
##  Garage.Finish       Garage.Cars     Garage.Area   Garage.Qual       
##  Length:2925        Min.   :0.000   Min.   :   0   Length:2925       
##  Class :character   1st Qu.:1.000   1st Qu.: 320   Class :character  
##  Mode  :character   Median :2.000   Median : 480   Mode  :character  
##                     Mean   :1.766   Mean   : 472                     
##                     3rd Qu.:2.000   3rd Qu.: 576                     
##                     Max.   :5.000   Max.   :1488                     
##                     NA's   :1       NA's   :1                        
##  Garage.Cond        Paved.Drive         Wood.Deck.SF     Open.Porch.SF   
##  Length:2925        Length:2925        Min.   :   0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.00   Median : 27.00  
##                                        Mean   :  93.58   Mean   : 47.21  
##                                        3rd Qu.: 168.00   3rd Qu.: 70.00  
##                                        Max.   :1424.00   Max.   :742.00  
##                                                                          
##  Enclosed.Porch     X3Ssn.Porch       Screen.Porch      Pool.Area      
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.000  
##  Mean   :  23.05   Mean   :  2.597   Mean   : 16.03   Mean   :  2.083  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :1012.00   Max.   :508.000   Max.   :576.00   Max.   :800.000  
##                                                                        
##     Fence              Misc.Val           Yr.Sold      Sale.Type        
##  Length:2925        Min.   :    0.00   Min.   :2006   Length:2925       
##  Class :character   1st Qu.:    0.00   1st Qu.:2007   Class :character  
##  Mode  :character   Median :    0.00   Median :2008   Mode  :character  
##                     Mean   :   44.91   Mean   :2008                     
##                     3rd Qu.:    0.00   3rd Qu.:2009                     
##                     Max.   :15500.00   Max.   :2010                     
##                                                                         
##  Sale.Condition       SalePrice     
##  Length:2925        Min.   : 34900  
##  Class :character   1st Qu.:129500  
##  Mode  :character   Median :160000  
##                     Mean   :180916  
##                     3rd Qu.:213500  
##                     Max.   :755000  
## 
# Keep complete cases only: any row containing at least one NA is dropped.
data2 <- na.omit(object = data)
# Inspect the structure of the cleaned data frame:
str(object = data2)
## 'data.frame':    2258 obs. of  74 variables:
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  "None" "None" "None" "None" ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wood" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "wood" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "Gas" "Gas" "Gas" "Gas" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" "None" "None" "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fence          : chr  "None" "MnPrv" "None" "None" ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
##  - attr(*, "na.action")= 'omit' Named int [1:667] 12 15 23 24 25 28 56 58 59 67 ...
##   ..- attr(*, "names")= chr [1:667] "12" "15" "23" "24" ...
# Split the complete-case rows into training and test index vectors.
# 1800 rows are drawn for training; with the 2258 rows of data2 that is
# roughly an 80/20 split (not 70/30).
n_obs   <- nrow(data2)   # derived from the data instead of hard-coding 2258
n_train <- 1800
train <- sample(n_obs, n_train)
# Every row index that was not sampled for training, in ascending order:
test <- setdiff(seq_len(n_obs), train)

# Create a data frame with the numeric variables only.
# Columns are selected by type rather than by hard-coded positions, so the
# selection stays correct if columns are added, removed or reordered upstream.
# For the 74-column layout printed by str(data2) this yields the same 35
# integer columns as the former index list
# c(2,3,14:17,23,31,33:35,40:49,51,53,56,58,59,63:68,70,71,74).
# Note: several of these are counts, years or ratings rather than truly
# continuous measurements.
is_numeric_col <- vapply(data2, is.numeric, logical(1))
num.ames <- data2[, is_numeric_col, drop = FALSE]

# Checking Data Correlation and Distribution:
# One scatter plot of SalePrice against each other column, training rows only.
plot(SalePrice ~ ., data = num.ames, subset = train)

# Modeling:

# Create First Model:
# Linear regression of SalePrice on eleven numeric predictors,
# estimated on the training rows only.
fit <- lm(
  SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add +
    BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
    TotRms.AbvGrd + Garage.Yr.Blt + Wood.Deck.SF + Open.Porch.SF,
  data   = num.ames,
  subset = train
)

# Return model summary of first model:
summary(object = fit)
## 
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + 
##     BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + 
##     TotRms.AbvGrd + Garage.Yr.Blt + Wood.Deck.SF + Open.Porch.SF, 
##     data = num.ames, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -136957  -18417   -1373   16771  230613 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.119e+06  9.574e+04 -11.689  < 2e-16 ***
## Overall.Qual    1.966e+04  9.020e+02  21.791  < 2e-16 ***
## Year.Built      1.537e+02  4.895e+01   3.141 0.001714 ** 
## Year.Remod.Add  2.550e+02  5.246e+01   4.860 1.27e-06 ***
## BsmtFin.SF.1    3.251e+01  2.062e+00  15.767  < 2e-16 ***
## Total.Bsmt.SF   2.694e+01  3.472e+00   7.759 1.43e-14 ***
## X1st.Flr.SF     1.274e+01  3.754e+00   3.393 0.000706 ***
## Gr.Liv.Area     6.844e+01  3.282e+00  20.853  < 2e-16 ***
## TotRms.AbvGrd  -1.002e+03  8.885e+02  -1.128 0.259577    
## Garage.Yr.Blt   1.097e+02  5.597e+01   1.960 0.050202 .  
## Wood.Deck.SF    1.843e+01  6.779e+00   2.718 0.006621 ** 
## Open.Porch.SF   1.395e+01  1.291e+01   1.080 0.280151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32820 on 1788 degrees of freedom
## Multiple R-squared:  0.8528, Adjusted R-squared:  0.8519 
## F-statistic: 941.6 on 11 and 1788 DF,  p-value: < 2.2e-16
# Create Second Model:
# Same specification as the first model with Open.Porch.SF removed
# (same data and training subset, inherited from the call stored in `fit`).
fit2 <- update(fit, . ~ . - Open.Porch.SF)
# Return model summary of second model:
summary(object = fit2)
## 
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + 
##     BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + 
##     TotRms.AbvGrd + Garage.Yr.Blt + Wood.Deck.SF, data = num.ames, 
##     subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -137514  -18485   -1627   16894  231403 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.136e+06  9.446e+04 -12.026  < 2e-16 ***
## Overall.Qual    1.966e+04  9.020e+02  21.796  < 2e-16 ***
## Year.Built      1.542e+02  4.895e+01   3.151 0.001656 ** 
## Year.Remod.Add  2.605e+02  5.221e+01   4.991 6.60e-07 ***
## BsmtFin.SF.1    3.251e+01  2.062e+00  15.765  < 2e-16 ***
## Total.Bsmt.SF   2.723e+01  3.462e+00   7.866 6.26e-15 ***
## X1st.Flr.SF     1.256e+01  3.750e+00   3.349 0.000827 ***
## Gr.Liv.Area     6.900e+01  3.241e+00  21.292  < 2e-16 ***
## TotRms.AbvGrd  -1.034e+03  8.881e+02  -1.164 0.244550    
## Garage.Yr.Blt   1.121e+02  5.593e+01   2.004 0.045181 *  
## Wood.Deck.SF    1.775e+01  6.750e+00   2.630 0.008619 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32820 on 1789 degrees of freedom
## Multiple R-squared:  0.8527, Adjusted R-squared:  0.8519 
## F-statistic:  1036 on 10 and 1789 DF,  p-value: < 2.2e-16
# Create Third Model:
# Same specification as the second model with TotRms.AbvGrd removed
# (same data and training subset, inherited from the call stored in `fit2`).
fit3 <- update(fit2, . ~ . - TotRms.AbvGrd)
# Return model summary of third model:
summary(object = fit3)
## 
## Call:
## lm(formula = SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + 
##     BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + 
##     Garage.Yr.Blt + Wood.Deck.SF, data = num.ames, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -137509  -18564   -1665   17169  229284 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.140e+06  9.442e+04 -12.070  < 2e-16 ***
## Overall.Qual    1.974e+04  8.993e+02  21.955  < 2e-16 ***
## Year.Built      1.534e+02  4.895e+01   3.134 0.001749 ** 
## Year.Remod.Add  2.620e+02  5.220e+01   5.020 5.67e-07 ***
## BsmtFin.SF.1    3.285e+01  2.041e+00  16.091  < 2e-16 ***
## Total.Bsmt.SF   2.725e+01  3.462e+00   7.872 5.99e-15 ***
## X1st.Flr.SF     1.258e+01  3.751e+00   3.353 0.000816 ***
## Gr.Liv.Area     6.619e+01  2.160e+00  30.642  < 2e-16 ***
## Garage.Yr.Blt   1.116e+02  5.593e+01   1.995 0.046140 *  
## Wood.Deck.SF    1.787e+01  6.750e+00   2.648 0.008169 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32830 on 1790 degrees of freedom
## Multiple R-squared:  0.8526, Adjusted R-squared:  0.8518 
## F-statistic:  1150 on 9 and 1790 DF,  p-value: < 2.2e-16

Check Model Diagnostics:

# Pull residuals and fitted values out of the third model once, through the
# accessor functions. The previous fit3$res / fit3$fitted only worked through
# partial matching of `$` against the "residuals" and "fitted.values" components.
res3    <- residuals(fit3)
fitted3 <- fitted(fit3)

# Plot Fitted vs Residuals:
plot(res3 ~ fitted3, main = "Fitted vs Residuals",
     xlab = "Fitted values", ylab = "Residuals")

# Check normality of model:
hist(res3, main = "Normality Test", xlab = "Residuals",
     col = c("blue", "red", "green"))

# Plot qq-plot:
qqnorm(res3)
# add reference line:
qqline(res3)

# Compute Shapiro-Wilk Test for Normality Check:
shapiro.test(res3)
## 
##  Shapiro-Wilk normality test
## 
## data:  res3
## W = 0.93102, p-value < 2.2e-16

Box-Cox Transformation:

# Run the Box-Cox procedure to help choose a normalizing transformation of SalePrice.
# Only the training rows are passed in, so the held-out rows do not influence
# the choice; this matches the other fits in this script, which use subset = train.
# NOTE(review): boxcox() is not defined in this section - presumably MASS::boxcox
# attached elsewhere; confirm.
boxcox(SalePrice ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
         Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
         Garage.Yr.Blt + Wood.Deck.SF,
       data = num.ames[train, ])

# Create new variable that is the log of SalePrice.
# It is kept at full length (one value per row of num.ames) because the models
# that use it select the training rows themselves via subset = train.
SalePriceLog <- log(num.ames$SalePrice)

# Create new model using SalePriceLog for the dependent variable.
# subset = train was missing here, so the model was fitted on all rows,
# including the held-out test rows.
# NOTE(review): this formula still contains TotRms.AbvGrd, which was dropped
# when moving from the second to the third model - confirm that is intended.
fit4 <- lm(SalePriceLog ~ Overall.Qual + Year.Built + Year.Remod.Add + BsmtFin.SF.1 +
             Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area + TotRms.AbvGrd +
             Garage.Yr.Blt + Wood.Deck.SF,
           data = num.ames, subset = train)

# Plot Fitted vs Residual Values:
# Accessor functions are used instead of partially matched fit4$res / fit4$fitted.
plot(residuals(fit4) ~ fitted(fit4), main = "Diagnostic Check Model 4",
     xlab = "Fitted values", ylab = "Residuals")

# Determine Categorical Variables For Model:

# Using anova: determine the categorical variables to use in the final model.
# model1 is the numeric-only baseline for the log response on the training rows;
# model2 is the same model with Street added. anova() then compares the two
# nested fits.
model1 <- lm(
  SalePriceLog ~ Overall.Qual + Year.Built + Year.Remod.Add +
    BsmtFin.SF.1 + Total.Bsmt.SF + X1st.Flr.SF + Gr.Liv.Area +
    Wood.Deck.SF + Open.Porch.SF,
  data   = data2,
  subset = train
)
model2 <- update(model1, . ~ . + Street)
anova(model1, model2)
str(object = data2)
## 'data.frame':    2258 obs. of  74 variables:
##  $ MS.Zoning      : chr  "RL" "RH" "RL" "RL" ...
##  $ Lot.Frontage   : int  141 80 81 93 74 78 41 43 39 60 ...
##  $ Lot.Area       : int  31770 11622 14267 11160 13830 9978 4920 5005 5389 7500 ...
##  $ Street         : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley          : chr  "None" "None" "None" "None" ...
##  $ Lot.Shape      : chr  "IR1" "Reg" "IR1" "Reg" ...
##  $ Land.Contour   : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Lot.Config     : chr  "Corner" "Inside" "Corner" "Corner" ...
##  $ Land.Slope     : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Condition.1    : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition.2    : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ Bldg.Type      : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ House.Style    : chr  "1Story" "1Story" "1Story" "1Story" ...
##  $ Overall.Qual   : int  6 5 6 7 5 6 8 8 8 7 ...
##  $ Overall.Cond   : int  5 6 6 5 5 6 5 5 5 5 ...
##  $ Year.Built     : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Year.Remod.Add : int  1960 1961 1958 1968 1998 1998 2001 1992 1996 1999 ...
##  $ Roof.Style     : chr  "Hip" "Gable" "Hip" "Hip" ...
##  $ Roof.Matl      : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior.1st   : chr  "BrkFace" "VinylSd" "Wood" "BrkFace" ...
##  $ Exterior.2nd   : chr  "Plywood" "VinylSd" "wood" "BrkFace" ...
##  $ Mas.Vnr.Type   : chr  "Stone" "None" "BrkFace" "None" ...
##  $ Mas.Vnr.Area   : int  112 0 108 0 0 20 0 0 0 0 ...
##  $ Exter.Qual     : chr  "TA" "TA" "TA" "Gd" ...
##  $ Exter.Cond     : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation     : chr  "CBlock" "CBlock" "CBlock" "CBlock" ...
##  $ Bsmt.Qual      : chr  "TA" "TA" "TA" "TA" ...
##  $ Bsmt.Cond      : chr  "Gd" "TA" "TA" "TA" ...
##  $ Bsmt.Exposure  : chr  "Gd" "No" "No" "No" ...
##  $ BsmtFin.Type.1 : chr  "BLQ" "Rec" "ALQ" "ALQ" ...
##  $ BsmtFin.SF.1   : int  639 468 923 1065 791 602 616 263 1180 0 ...
##  $ BsmtFin.Type.2 : chr  "Unf" "LwQ" "Unf" "Unf" ...
##  $ BsmtFin.SF.2   : int  0 144 0 0 0 0 0 0 0 0 ...
##  $ Bsmt.Unf.SF    : int  441 270 406 1045 137 324 722 1017 415 994 ...
##  $ Total.Bsmt.SF  : int  1080 882 1329 2110 928 926 1338 1280 1595 994 ...
##  $ Heating        : chr  "Gas" "Gas" "Gas" "Gas" ...
##  $ Heating.QC     : chr  "Fa" "TA" "TA" "Ex" ...
##  $ Central.Air    : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical     : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1st.Flr.SF    : int  1656 896 1329 2110 928 926 1338 1280 1616 1028 ...
##  $ X2nd.Flr.SF    : int  0 0 0 0 701 678 0 0 0 776 ...
##  $ Low.Qual.Fin.SF: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Gr.Liv.Area    : int  1656 896 1329 2110 1629 1604 1338 1280 1616 1804 ...
##  $ Bsmt.Full.Bath : int  1 0 0 1 0 0 1 0 1 0 ...
##  $ Bsmt.Half.Bath : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Full.Bath      : int  1 1 1 2 2 2 2 2 2 2 ...
##  $ Half.Bath      : int  0 0 1 1 1 1 0 0 0 1 ...
##  $ Bedroom.AbvGr  : int  3 2 3 3 3 3 2 2 2 3 ...
##  $ Kitchen.AbvGr  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kitchen.Qual   : chr  "TA" "TA" "Gd" "Ex" ...
##  $ TotRms.AbvGrd  : int  7 5 6 8 6 7 6 5 5 7 ...
##  $ Functional     : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces     : int  2 0 0 2 1 1 0 0 1 1 ...
##  $ Fireplace.Qu   : chr  "Gd" "None" "None" "TA" ...
##  $ Garage.Type    : chr  "Attchd" "Attchd" "Attchd" "Attchd" ...
##  $ Garage.Yr.Blt  : int  1960 1961 1958 1968 1997 1998 2001 1992 1995 1999 ...
##  $ Garage.Finish  : chr  "Fin" "Unf" "Unf" "Fin" ...
##  $ Garage.Cars    : int  2 1 1 2 2 2 2 2 2 2 ...
##  $ Garage.Area    : int  528 730 312 522 482 470 582 506 608 442 ...
##  $ Garage.Qual    : chr  "TA" "TA" "TA" "TA" ...
##  $ Garage.Cond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Paved.Drive    : chr  "P" "Y" "Y" "Y" ...
##  $ Wood.Deck.SF   : int  210 140 393 0 212 360 0 0 237 140 ...
##  $ Open.Porch.SF  : int  62 0 36 0 34 36 0 82 152 60 ...
##  $ Enclosed.Porch : int  0 0 0 0 0 0 170 0 0 0 ...
##  $ X3Ssn.Porch    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Screen.Porch   : int  0 120 0 0 0 0 0 144 0 0 ...
##  $ Pool.Area      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fence          : chr  "None" "MnPrv" "None" "None" ...
##  $ Misc.Val       : int  0 0 12500 0 0 0 0 0 0 0 ...
##  $ Yr.Sold        : int  2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ Sale.Type      : chr  "WD " "WD " "WD " "WD " ...
##  $ Sale.Condition : chr  "Normal" "Normal" "Normal" "Normal" ...
##  $ SalePrice      : int  215000 105000 172000 244000 189900 195500 213500 191500 236500 189000 ...
##  - attr(*, "na.action")= 'omit' Named int [1:667] 12 15 23 24 25 28 56 58 59 67 ...
##   ..- attr(*, "names")= chr [1:667] "12" "15" "23" "24" ...